@comment{
  SIBGRAPI 2021 conference paper on 2D hand keypoint detection with
  depthwise separable convolutions.

  Maintenance notes for the entry that follows:
  - "2D" in the title is brace-protected so that sentence-casing styles do
    not render it as "2d".
  - The record used to carry two "url" fields. A field may appear only once
    per entry (classic BibTeX keeps the first and warns about the extra), so
    the DOI link stays in "url" and the urlib.net link that resolves the
    "ibi" identifier is stored in the separate "ibi-url" field.
  - affiliation, conference-location, conference-year, language, ibi,
    ibi-url, targetfile and urlaccessdate are not standard BibTeX fields;
    standard styles ignore unknown fields, so they act as metadata only.
  - NOTE(review): booktitle "Proceedings..." looks like a truncated export
    placeholder, and the conference name currently lives in "organization".
    Confirm the full proceedings title before changing either field.
  - NOTE(review): urlaccessdate presumably refers to the urlib.net link;
    confirm against the exporting repository.
}
@InProceedings{CostaFigTeiLimTei:2021:HaPoEs,
  author = "Costa, Willams and Figueiredo, Lucas and Teixeira, Jo{\~a}o
    Marcelo and Lima, Jo{\~a}o Paulo and Teichrieb, Veronica",
  affiliation = "Voxar Labs, Centro de Inform{\'a}tica, Universidade Federal de
    Pernambuco and Voxar Labs, Centro de Inform{\'a}tica,
    Universidade Federal de Pernambuco and Voxar Labs, Centro de
    Inform{\'a}tica, Universidade Federal de Pernambuco and
    Departamento de Computa{\c{c}}{\~a}o, Universidade Federal Rural
    de Pernambuco and Voxar Labs, Centro de Inform{\'a}tica,
    Universidade Federal de Pernambuco",
  title = "An Investigation of {2D} Keypoints Detection on Challenging
    Scenarios Using Depthwise Separable Convolutions: A Hand Pose
    Estimation Case Study",
  booktitle = "Proceedings...",
  year = "2021",
  editor = "Paiva, Afonso and Menotti, David and Baranoski, Gladimir V. G. and
    Proen{\c{c}}a, Hugo Pedro and Junior, Antonio Lopes Apolinario
    and Papa, Jo{\~a}o Paulo and Pagliosa, Paulo and dos Santos,
    Thiago Oliveira and e S{\'a}, Asla Medeiros and da Silveira,
    Thiago Lopes Trugillo and Brazil, Emilio Vital and Ponti, Moacir
    A. and Fernandes, Leandro A. F. and Avila, Sandra",
  organization = "Conference on Graphics, Patterns and Images, 34. (SIBGRAPI)",
  publisher = "IEEE Computer Society",
  address = "Los Alamitos",
  keywords = "real-time hand pose estimation, human-computer interaction,
    depthwise separable convolutions.",
  abstract = "2D keypoints detection is a computer vision task applicable to
    several fields such as hand, face, and body tracking, which
    provides useful information for spatial analytics, gestural
    interactions, and augmented reality applications. This work
    investigates the usage of depthwise separable convolutions (an
    optimized convolution operation) to speed up the inference time on
    a largely used architecture for 2D keypoints estimation. We
    evaluate the impacts on the precision and performance of such
    optimization on a hand pose estimation task. We also extend the
    evaluation towards simulated challenging scenarios of defocused
    lens, motion blur, occlusions, and noisy images to understand how
    these stress situations affect both the original and the optimized
    architectures. We show that the execution time can be improved on
    average by 12.8\% with an accuracy compromise of less than 1
    pixel (mean EPE). The experiments on challenging scenarios
    revealed that the model powered by depthwise separable
    convolutions is most fit for the occlusion cases and noisy
    environments while suffering more on the motion blur simulated
    scenarios.",
  conference-location = "Gramado, RS, Brazil (virtual)",
  conference-year = "18-22 Oct. 2021",
  doi = "10.1109/SIBGRAPI54419.2021.00017",
  url = "http://dx.doi.org/10.1109/SIBGRAPI54419.2021.00017",
  language = "en",
  ibi = "8JMKD3MGPEW34M/45C6J72",
  ibi-url = "http://urlib.net/ibi/8JMKD3MGPEW34M/45C6J72",
  targetfile = "Hand3d_camera-ready.pdf",
  urlaccessdate = "2024, May 06"
}